set scheme stcolor

********************************************************************************
* HYPOTHESIS-LEVEL ANALYSIS
********************************************************************************

	use "../Metadata/tess_analysisdata.dta", clear

* make variables	
		recode title (1 2 6=0) (3/5=1), gen(faculty)
		lab var faculty "Faculty PI"

		gen polsci = (discipline == 5)
		lab var polsci "Political science"

		recode numauthors (1=0) (2/10=1), gen(authors2)
		lab var authors2 "Co-authored"

		recode check_type (0=0) (1/3=1), gen(checks)
		lab var checks "Any checks"

		recode num_measures (1=0) (2/50=1), gen(multiitem)
		lab var multiitem "Multiple items"
		
		recode outcome_categories (2=1) (3/8=0), gen(binarydv)
		lab var binarydv "Binary DV"

		recode outcome_categories (2/7=0) (8=1), gen(continuousdv)
		lab var continuousdv "Continuous DV"		
		
	* relabel var for length
		lab var type_framing "Framing exp."
		lab var type_priming "Priming exp."
		lab var type_vignette "Vignette exp."
		lab var type_questwording "Question-wording exp."
		lab var type_information "Information provision exp."
		lab var type_beliefelicit "Belief elicitation exp."
		lab var type_writing "Writing treatment"
		lab var type_conj_dce "Conjoint/Discrete choice exp."
		
* keep insample obs
		keep if insample==1

* set random sort order
		set seed 10101
		gen random=runiform()
		sort random


* how many iterations do we need?
		gen outofbagerror1=.
		gen validationerror1=.
		gen iter1=.

		local j=0
		forval i=100(10)1000 {
			local j=`j'+1
			
		rforest hyp_true ///
			N_person ///
			medtime15  ///
			type_beliefelicit type_conj_dce type_framing type_information ///
			type_priming type_questwording type_vignette type_writing ///
			mediation moderation hyp_t hyp_ks hyp_default ///
			woman ///
			faculty polsci shortstudy ///
			pilot clarity ///
			checks ///	
			binarydv continuousdv multiitem ///
			authors2 ///
			panel ///
			in 1/506, ///
			 iter(`j') type(class) numvars(1)
			 
			 
			 qui replace iter1= `i' in `j'
			 qui replace outofbagerror1= `e(OOB_Error)' in `j'
			 predict p in 507/1012
			 qui replace validationerror1 = `e(error_rate)' in `j'
			 drop p
		}


			label var outofbagerror1 "Out of Bag Error"
			label var iter1 "Iterations"
			label var validationerror1 "Validation Error"
			scatter outofbagerror1 iter1, mcolor(blue) msize(tiny) || ///
			scatter validationerror1 iter1, mcolor(red) msize(tiny)

			// OOB and validation error minimize around 300 iterations

	
* how many variables at a time?
		gen outofbagerror2=.
		gen validationerror2=.
		gen nvars=.

		local j=0
		forval i=1(1)26 {
			local j=`j'+1
			
		rforest hyp_true ///
			N_person ///
			medtime15  ///
			type_beliefelicit type_conj_dce type_framing type_information ///
			type_priming type_questwording type_vignette type_writing ///
			mediation moderation hyp_t hyp_ks hyp_default ///
			woman ///
			faculty polsci shortstudy ///
			pilot clarity ///
			checks ///	
			binarydv continuousdv multiitem ///
			authors2 ///
			panel ///
			in 1/506, ///
			 iter(300) type(class) numvars(`i')
			 
			 
			 qui replace nvars= `i' in `j'
			 qui replace outofbagerror2= `e(OOB_Error)' in `j'
			 predict p in 507/1012
			 qui replace validationerror2 = `e(error_rate)' in `j'
			 drop p
		}


			label var outofbagerror2 "Out of Bag Error"
			label var nvars "No. of variables"
			label var validationerror2 "Validation Error"
			scatter outofbagerror2 nvars, mcolor(blue) msize(tiny) || ///
			scatter validationerror2 nvars, mcolor(red) msize(tiny)
			
		// assess lowest validation error rate	
		cap frame drop mydata 
		frame put validationerror2 outofbagerror2 nvars, into(mydata)  
		frame mydata { 
			gen totalerror=validationerror2 +outofbagerror2
			sort totalerror, stable 
			local min_val_err = validationerror2[1] 
			local min_nvars = nvars[1] 
			} 

		di "Minimum error ="  `min_val_err' 
		di "Corresponding number of variables =" `min_nvars' 
		// 4 vars
	
		frame drop mydata 
	
* estimate hyp-level model
		rforest hyp_true ///
			N_person ///
			medtime15  ///
			type_beliefelicit type_conj_dce type_framing type_information ///
			type_priming type_questwording type_vignette type_writing ///
			mediation moderation hyp_t hyp_ks hyp_default ///
			woman ///
			faculty polsci shortstudy ///
			pilot clarity ///
			checks ///	
			binarydv continuousdv multiitem ///
			authors2 ///
			panel ///
			in 1/506, ///
			iter(300) type(class) numvars(4)
		
		di e(OOB_Error) 
		cap drop p
		
		predict p in 507/1012
		di e(error_rate) 

* variable importance matrix
		matrix importance = e(importance)
		matrix list importance

		// create a variable with importance scores from matrix
		cap drop importance
		svmat importance

		// create a variable with row names from matrix
		cap drop importid
		generate importid= ""
		local names: rownames importance 
		local k: word count `names'
		if `k'>_N {
			set obs `k'
		}
		forval i=1(1)`k' {
			local aword: word `i' of `names'
			local alabel: var label `aword'
			if ("`alabel'"!="") qui replace importid = "`alabel'" in `i'
			else qui replace importid= "`aword'" in `i'
		}



// to add directional arrows: create a variable with correlations of var with success
		correl hyp_true ///
			N_person ///
			medtime15  ///
			type_beliefelicit type_conj_dce type_framing type_information ///
			type_priming type_questwording type_vignette type_writing ///
			mediation moderation hyp_t hyp_ks hyp_default ///
			woman ///
			faculty polsci shortstudy ///
			pilot clarity ///
			checks ///	
			binarydv continuousdv multiitem ///
			authors2 ///
			panel 
	

		matrix corr = r(C)
		matrix list corr
		matselrc corr corr1, r(2/28) c(1/1)
		matrix list corr1

		// create a new variable with correlations from matrix
		cap drop corr1
		svmat corr1

		cap drop direction
		gen direction=""
		replace direction= " ({&uarr})" if corr11>0 & corr11!=.
		replace direction= " ({&darr})" if corr11<0 & corr11!=.

		// concat importid and direction
		cap drop labels
		egen labels = concat(importid direction)


* Hyp level importance score chart	
		graph hbar (mean) importance1, ///
		over(labels, sort(1) descending label(labsize(4)) axis(noline)) ///
		bar(1, fcolor(stc1*.6) lc(none)) ///
		ytitle(importance1) ///
		ytitle("Importance score", size(3.5) margin(l=-8)) ///
		ylabel(, labsize(3.5) nogrid) ///
		graphregion(color(white)) scheme(stcolor) ///
		title("{it: Hypothesis-level support}", span) ///
		ysize(3) xsize(2)		
		graph export ../Results/Figure1_left.tif, replace

		
		
********************************************************************************		
* STUDY LEVEL ANALYSIS
********************************************************************************
		
use "../Metadata/tess_analysisdata.dta", clear

* make variables	
		recode title (1 2 6=0) (3/5=1), gen(faculty)
		lab var faculty "Faculty PI"

		gen polsci = (discipline == 5)
		lab var polsci "Political science"

		recode numauthors (1=0) (2/10=1), gen(authors2)
		lab var authors2 "Co-authored"

		recode check_type (0=0) (1/3=1), gen(checks)
		lab var checks "Any checks"
	

		* relabel var for length
			lab var type_framing "Framing exp."
			lab var type_priming "Priming exp."
			lab var type_vignette "Vignette exp."
			lab var type_questwording "Question-wording exp."
			lab var type_information "Information provision exp."
			lab var type_beliefelicit "Belief elicitation exp."
			lab var type_writing "Writing treatment"
			lab var type_conj_dce "Conjoint/Discrete choice exp."
			
	
		* average level of hyp-level vars
			* at least one binary DV
			recode outcome_categories (2=1) (3/8=0), gen(bin)
			egen binarydv= mean(bin), by(vendor_id)
			lab var binarydv "Binary DV (at least 1)"

			recode outcome_categories (2/7=0) (8=1), gen(cont)
			egen continuousdv= mean(cont), by(vendor_id)
			lab var continuousdv "Continuous DV (at least 1)"				
				
			* number of items used to measure DV, on average across the study
			egen avgnummeasures= mean(num_measures), by(vendor_id)
			forval i=1/5 {
			lab var avgnummeasures "Avg. # of items per DV"
			}
			
			* whether the study had any of these types of hypotheses
			tab hyp_type, gen(hypcat)
			forval i=1/5 {	
			egen  hypcatany`i'=max(hypcat`i'), by(vendor_id)	
			}
			lab var hypcatany1 "Default treat. comp. (at least 1)"
			lab var hypcatany2 "Moderation test (at least 1)"
			lab var hypcatany3 "Mediation test (at least 1)"
			lab var hypcatany4 "t-test (at least 1)"
			lab var hypcatany5 "KS-test (at least 1)"	

			lab var medtime15 "Median duration (mins.)"
			
			
		* study-level analysis	
		keep if hyp_num==1	
	
* set seed
		set seed 10101
		generate u = runiform()
		sort u

* how many iterations do we need?
		gen outofbagerror1=.
		gen validationerror1=.
		gen iter1=.

		local j=0
		forval i=100(10)500 {
			local j=`j'+1
			
		rforest successfulexp_insample ///
			samplesize ///
			medtime15  ///
			type_beliefelicit type_conj_dce type_framing type_information ///
			type_priming type_questwording type_vignette type_writing ///
			hypcatany* ///
			woman ///
			faculty polsci shortstudy ///
			pilot clarity ///
			checks ///
			binarydv continuousdv avgnummeasures ///
			authors2 ///
			in 1/60, ///
			 iter(`j') type(class) numvars(1)
			 
			 
			 qui replace iter1= `i' in `j'
			 qui replace outofbagerror1= `e(OOB_Error)' in `j'
			 predict p in 61/100
			 qui replace validationerror1 = `e(error_rate)' in `j'
			 drop p
		}


			label var outofbagerror1 "Out of Bag Error"
			label var iter1 "Iterations"
			label var validationerror1 "Validation Error"
			scatter outofbagerror1 iter1, mcolor(blue) msize(tiny) || ///
			scatter validationerror1 iter1, mcolor(red) msize(tiny)
				// 400 iterations

		* how many variables at a time?
		gen outofbagerror2=.
		gen validationerror2=.
		gen nvars=.

		local j=0
		forval i=1(1)25 {
			local j=`j'+1
			
		rforest successfulexp_insample ///
			samplesize ///
			medtime15  ///
			type_beliefelicit type_conj_dce type_framing type_information ///
			type_priming type_questwording type_vignette type_writing ///
			hypcatany* ///
			woman ///
			faculty polsci shortstudy ///
			pilot clarity ///
			checks ///
			binarydv continuousdv avgnummeasures ///
			authors2 ///
			in 1/60, ///
			 iter(400) type(class) numvars(`i')
			 
			 
			 qui replace nvars= `i' in `j'
			 qui replace outofbagerror2= `e(OOB_Error)' in `j'
			 predict p in 61/100
			 qui replace validationerror2 = `e(error_rate)' in `j'
			 drop p
		}


			label var outofbagerror2 "Out of Bag Error"
			label var nvars "No. of variables"
			label var validationerror2 "Validation Error"
			scatter outofbagerror2 nvars, mcolor(blue) msize(tiny) || ///
			scatter validationerror2 nvars, mcolor(red) msize(tiny)


		// assess lowest validation error rate	
		cap frame drop mydata 
		frame put validationerror2 outofbagerror2 nvars, into(mydata)  
		frame mydata { 
			gen totalerror=validationerror2 +outofbagerror2
			sort totalerror, stable 
			local min_val_err = validationerror2[1] 
			local min_nvars = nvars[1] 
			} 

		di "Minimum error ="  `min_val_err' 
		di "Corresponding number of variables =" `min_nvars' 
		// 1
			
		cap frame drop mydata 

* estimate model
		rforest successfulexp_insample ///
			samplesize ///
			medtime15  ///
			type_beliefelicit type_conj_dce type_framing type_information ///
			type_priming type_questwording type_vignette type_writing ///
			hypcatany* ///
			woman ///
			faculty polsci shortstudy ///
			pilot clarity ///
			checks ///
			binarydv continuousdv avgnummeasures ///
			authors2 ///
			, ///
			iter(400) type(class) numvars(1)
		
		di e(OOB_Error) 
		cap drop p 
		
		predict p
		di e(error_rate) // there is no error, because all data is used



* variable importance matrix
		matrix importance = e(importance)
		matrix list importance

		// create a variable with importance scores from matrix
		cap drop importance
		svmat importance

		// create a variable with row names from matrix
		cap drop importid
		generate importid= ""
		local names: rownames importance 
		local k: word count `names'
		if `k'>_N {
			set obs `k'
		}
		forval i=1(1)`k' {
			local aword: word `i' of `names'
			local alabel: var label `aword'
			if ("`alabel'"!="") qui replace importid = "`alabel'" in `i'
			else qui replace importid= "`aword'" in `i'
		}


// to add directional arrows: create a variable with correlations of var with success
		correl successfulexp_insample ///
			samplesize ///
			medtime15  ///
			type_beliefelicit type_conj_dce type_framing type_information ///
			type_priming type_questwording type_vignette type_writing ///
			hypcatany* ///
			woman ///
			faculty polsci shortstudy ///
			pilot clarity ///
			checks ///
			binarydv continuousdv avgnummeasures ///
			authors2 
		

			matrix corr = r(C)
			matrix list corr
			matselrc corr corr1, r(2/27) c(1/1)
			matrix list corr1

			// create a new variable with correlations from matrix
			cap drop corr1
			svmat corr1

			cap drop direction
			gen direction=""
			replace direction=" ({&uarr})" if corr11>0 & corr11!=.
			replace direction=" ({&darr})" if corr11<0 & corr11!=.

			// concat importid and direction
			cap drop labels
			egen labels = concat(importid direction)



* Importance score chart
		graph hbar (mean) importance1, ///
		over(labels, sort(1) descending label(labsize(4)) axis(noline)) ///
		bar(1, fcolor(stc2*.6) lc(none)) ///
		ytitle(importance1) ///
		ytitle("Importance score", size(3.5) margin(l=-8)) ///
		ylabel(, labsize(3.5) nogrid) ///
		graphregion(color(white)) scheme(stcolor) ///
		title("{it: Study-level positive result}", span) ///
		ysize(3) xsize(2)
		graph export ../Results/Figure1_right.tif, replace

